In [12]:
!pip install seaborn
!pip install plotly
!pip install numpy
Requirement already satisfied: seaborn in c:\users\abane\music\da_prject internship\env\lib\site-packages (0.13.2) Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from seaborn) (2.3.3) Requirement already satisfied: pandas>=1.2 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from seaborn) (2.3.3) Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from seaborn) (3.10.6) Requirement already satisfied: contourpy>=1.0.1 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3) Requirement already satisfied: cycler>=0.10 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.60.1) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.9) Requirement already satisfied: packaging>=20.0 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0) Requirement already satisfied: pillow>=8 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.3.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.5) Requirement already satisfied: python-dateutil>=2.7 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from pandas>=1.2->seaborn) (2025.2) 
Requirement already satisfied: tzdata>=2022.7 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from pandas>=1.2->seaborn) (2025.2) Requirement already satisfied: six>=1.5 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: plotly in c:\users\abane\music\da_prject internship\env\lib\site-packages (6.3.1) Requirement already satisfied: narwhals>=1.15.1 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from plotly) (2.6.0) Requirement already satisfied: packaging in c:\users\abane\music\da_prject internship\env\lib\site-packages (from plotly) (25.0)
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: numpy in c:\users\abane\music\da_prject internship\env\lib\site-packages (2.3.3)
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [9]:
!pip install matplotlib
Collecting matplotlib
Downloading matplotlib-3.10.6-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
Downloading contourpy-1.3.3-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
Downloading fonttools-4.60.1-cp311-cp311-win_amd64.whl.metadata (114 kB)
---------------------------------------- 0.0/114.6 kB ? eta -:--:--
--- ------------------------------------ 10.2/114.6 kB ? eta -:--:--
------------------- ----------------- 61.4/114.6 kB 825.8 kB/s eta 0:00:01
-------------------------------------- 114.6/114.6 kB 1.1 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib)
Downloading kiwisolver-1.4.9-cp311-cp311-win_amd64.whl.metadata (6.4 kB)
Requirement already satisfied: numpy>=1.23 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib) (2.3.3)
Requirement already satisfied: packaging>=20.0 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib) (25.0)
Collecting pillow>=8 (from matplotlib)
Using cached pillow-11.3.0-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.17.0)
Downloading matplotlib-3.10.6-cp311-cp311-win_amd64.whl (8.1 MB)
---------------------------------------- 0.0/8.1 MB ? eta -:--:--
--------------------------------------- 0.2/8.1 MB 5.0 MB/s eta 0:00:02
- -------------------------------------- 0.4/8.1 MB 4.5 MB/s eta 0:00:02
-- ------------------------------------- 0.6/8.1 MB 4.6 MB/s eta 0:00:02
--- ------------------------------------ 0.8/8.1 MB 4.6 MB/s eta 0:00:02
---- ----------------------------------- 1.0/8.1 MB 4.4 MB/s eta 0:00:02
----- ---------------------------------- 1.2/8.1 MB 4.4 MB/s eta 0:00:02
------- -------------------------------- 1.4/8.1 MB 4.6 MB/s eta 0:00:02
-------- ------------------------------- 1.6/8.1 MB 4.6 MB/s eta 0:00:02
--------- ------------------------------ 1.8/8.1 MB 4.5 MB/s eta 0:00:02
---------- ----------------------------- 2.1/8.1 MB 4.5 MB/s eta 0:00:02
----------- ---------------------------- 2.3/8.1 MB 4.6 MB/s eta 0:00:02
------------ --------------------------- 2.5/8.1 MB 4.6 MB/s eta 0:00:02
------------- -------------------------- 2.8/8.1 MB 4.7 MB/s eta 0:00:02
-------------- ------------------------- 3.0/8.1 MB 4.6 MB/s eta 0:00:02
--------------- ------------------------ 3.1/8.1 MB 4.7 MB/s eta 0:00:02
--------------- ------------------------ 3.2/8.1 MB 4.4 MB/s eta 0:00:02
---------------- ----------------------- 3.4/8.1 MB 4.3 MB/s eta 0:00:02
----------------- ---------------------- 3.6/8.1 MB 4.3 MB/s eta 0:00:02
------------------- -------------------- 3.9/8.1 MB 4.5 MB/s eta 0:00:01
-------------------- ------------------- 4.1/8.1 MB 4.5 MB/s eta 0:00:01
--------------------- ------------------ 4.4/8.1 MB 4.5 MB/s eta 0:00:01
---------------------- ----------------- 4.6/8.1 MB 4.5 MB/s eta 0:00:01
---------------------- ----------------- 4.6/8.1 MB 4.5 MB/s eta 0:00:01
------------------------- -------------- 5.2/8.1 MB 4.7 MB/s eta 0:00:01
-------------------------- ------------- 5.4/8.1 MB 4.6 MB/s eta 0:00:01
--------------------------- ------------ 5.6/8.1 MB 4.6 MB/s eta 0:00:01
---------------------------- ----------- 5.9/8.1 MB 4.7 MB/s eta 0:00:01
----------------------------- ---------- 6.1/8.1 MB 4.7 MB/s eta 0:00:01
------------------------------- -------- 6.3/8.1 MB 4.8 MB/s eta 0:00:01
-------------------------------- ------- 6.6/8.1 MB 4.7 MB/s eta 0:00:01
-------------------------------- ------- 6.6/8.1 MB 4.7 MB/s eta 0:00:01
---------------------------------- ----- 7.0/8.1 MB 4.7 MB/s eta 0:00:01
----------------------------------- ---- 7.2/8.1 MB 4.7 MB/s eta 0:00:01
------------------------------------ --- 7.3/8.1 MB 4.6 MB/s eta 0:00:01
------------------------------------ --- 7.5/8.1 MB 4.7 MB/s eta 0:00:01
-------------------------------------- - 7.8/8.1 MB 4.7 MB/s eta 0:00:01
--------------------------------------- 8.1/8.1 MB 4.8 MB/s eta 0:00:01
---------------------------------------- 8.1/8.1 MB 4.7 MB/s eta 0:00:00
Downloading contourpy-1.3.3-cp311-cp311-win_amd64.whl (225 kB)
---------------------------------------- 0.0/225.2 kB ? eta -:--:--
--------------------------------------- 225.2/225.2 kB 14.3 MB/s eta 0:00:00
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Downloading fonttools-4.60.1-cp311-cp311-win_amd64.whl (2.3 MB)
---------------------------------------- 0.0/2.3 MB ? eta -:--:--
- -------------------------------------- 0.1/2.3 MB 2.6 MB/s eta 0:00:01
------- -------------------------------- 0.5/2.3 MB 4.7 MB/s eta 0:00:01
------------ --------------------------- 0.7/2.3 MB 5.5 MB/s eta 0:00:01
--------------- ------------------------ 0.9/2.3 MB 5.1 MB/s eta 0:00:01
------------------ --------------------- 1.0/2.3 MB 4.7 MB/s eta 0:00:01
------------------ --------------------- 1.0/2.3 MB 4.7 MB/s eta 0:00:01
------------------------- -------------- 1.5/2.3 MB 4.7 MB/s eta 0:00:01
----------------------------- ---------- 1.7/2.3 MB 4.7 MB/s eta 0:00:01
---------------------------------- ----- 1.9/2.3 MB 4.8 MB/s eta 0:00:01
-------------------------------------- - 2.2/2.3 MB 4.9 MB/s eta 0:00:01
--------------------------------------- 2.3/2.3 MB 4.7 MB/s eta 0:00:01
---------------------------------------- 2.3/2.3 MB 4.4 MB/s eta 0:00:00
Downloading kiwisolver-1.4.9-cp311-cp311-win_amd64.whl (73 kB)
---------------------------------------- 0.0/73.8 kB ? eta -:--:--
-------------------------------------- - 71.7/73.8 kB ? eta -:--:--
---------------------------------------- 73.8/73.8 kB 1.4 MB/s eta 0:00:00
Using cached pillow-11.3.0-cp311-cp311-win_amd64.whl (7.0 MB)
Downloading pyparsing-3.2.5-py3-none-any.whl (113 kB)
---------------------------------------- 0.0/113.9 kB ? eta -:--:--
---------------------------------------- 113.9/113.9 kB 3.3 MB/s eta 0:00:00
Installing collected packages: pyparsing, pillow, kiwisolver, fonttools, cycler, contourpy, matplotlib
Successfully installed contourpy-1.3.3 cycler-0.12.1 fonttools-4.60.1 kiwisolver-1.4.9 matplotlib-3.10.6 pillow-11.3.0 pyparsing-3.2.5
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [1]:
!pip install pandas
Collecting pandas
Downloading pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
Using cached numpy-2.3.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from pandas) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas)
Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Requirement already satisfied: six>=1.5 in c:\users\abane\music\da_prject internship\env\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
Downloading pandas-2.3.3-cp311-cp311-win_amd64.whl (11.3 MB)
---------------------------------------- 0.0/11.3 MB ? eta -:--:--
---------------------------------------- 0.0/11.3 MB ? eta -:--:--
---------------------------------------- 0.0/11.3 MB 325.1 kB/s eta 0:00:35
---------------------------------------- 0.1/11.3 MB 465.5 kB/s eta 0:00:25
--------------------------------------- 0.1/11.3 MB 847.9 kB/s eta 0:00:14
--------------------------------------- 0.2/11.3 MB 1.0 MB/s eta 0:00:11
- -------------------------------------- 0.3/11.3 MB 1.2 MB/s eta 0:00:10
- -------------------------------------- 0.4/11.3 MB 1.3 MB/s eta 0:00:09
- -------------------------------------- 0.5/11.3 MB 1.4 MB/s eta 0:00:08
-- ------------------------------------- 0.7/11.3 MB 1.5 MB/s eta 0:00:08
-- ------------------------------------- 0.8/11.3 MB 1.6 MB/s eta 0:00:07
--- ------------------------------------ 0.9/11.3 MB 1.7 MB/s eta 0:00:07
--- ------------------------------------ 1.1/11.3 MB 1.8 MB/s eta 0:00:06
---- ----------------------------------- 1.2/11.3 MB 1.9 MB/s eta 0:00:06
---- ----------------------------------- 1.3/11.3 MB 1.9 MB/s eta 0:00:06
----- ---------------------------------- 1.4/11.3 MB 1.9 MB/s eta 0:00:06
----- ---------------------------------- 1.5/11.3 MB 2.0 MB/s eta 0:00:05
----- ---------------------------------- 1.6/11.3 MB 2.0 MB/s eta 0:00:05
------ --------------------------------- 1.8/11.3 MB 2.1 MB/s eta 0:00:05
------ --------------------------------- 2.0/11.3 MB 2.1 MB/s eta 0:00:05
------- -------------------------------- 2.1/11.3 MB 2.1 MB/s eta 0:00:05
-------- ------------------------------- 2.3/11.3 MB 2.3 MB/s eta 0:00:04
--------- ------------------------------ 2.6/11.3 MB 2.4 MB/s eta 0:00:04
--------- ------------------------------ 2.8/11.3 MB 2.5 MB/s eta 0:00:04
----------- ---------------------------- 3.2/11.3 MB 2.6 MB/s eta 0:00:04
----------- ---------------------------- 3.2/11.3 MB 2.6 MB/s eta 0:00:04
----------- ---------------------------- 3.3/11.3 MB 2.5 MB/s eta 0:00:04
------------ --------------------------- 3.6/11.3 MB 2.6 MB/s eta 0:00:03
------------ --------------------------- 3.6/11.3 MB 2.6 MB/s eta 0:00:03
------------- -------------------------- 3.8/11.3 MB 2.6 MB/s eta 0:00:03
-------------- ------------------------- 4.1/11.3 MB 2.7 MB/s eta 0:00:03
--------------- ------------------------ 4.5/11.3 MB 2.8 MB/s eta 0:00:03
---------------- ----------------------- 4.7/11.3 MB 2.8 MB/s eta 0:00:03
---------------- ----------------------- 4.8/11.3 MB 2.8 MB/s eta 0:00:03
------------------ --------------------- 5.1/11.3 MB 2.9 MB/s eta 0:00:03
------------------- -------------------- 5.5/11.3 MB 3.1 MB/s eta 0:00:02
------------------- -------------------- 5.7/11.3 MB 3.1 MB/s eta 0:00:02
-------------------- ------------------- 5.9/11.3 MB 3.1 MB/s eta 0:00:02
---------------------- ----------------- 6.3/11.3 MB 3.2 MB/s eta 0:00:02
---------------------- ----------------- 6.5/11.3 MB 3.2 MB/s eta 0:00:02
----------------------- ---------------- 6.8/11.3 MB 3.3 MB/s eta 0:00:02
------------------------- -------------- 7.2/11.3 MB 3.4 MB/s eta 0:00:02
-------------------------- ------------- 7.5/11.3 MB 3.5 MB/s eta 0:00:02
--------------------------- ------------ 7.8/11.3 MB 3.5 MB/s eta 0:00:02
---------------------------- ----------- 8.2/11.3 MB 3.6 MB/s eta 0:00:01
------------------------------ --------- 8.5/11.3 MB 3.7 MB/s eta 0:00:01
------------------------------ --------- 8.7/11.3 MB 3.7 MB/s eta 0:00:01
------------------------------- -------- 9.1/11.3 MB 3.8 MB/s eta 0:00:01
--------------------------------- ------ 9.4/11.3 MB 3.8 MB/s eta 0:00:01
---------------------------------- ----- 9.9/11.3 MB 3.9 MB/s eta 0:00:01
------------------------------------ --- 10.2/11.3 MB 3.9 MB/s eta 0:00:01
------------------------------------- -- 10.7/11.3 MB 4.4 MB/s eta 0:00:01
--------------------------------------- 11.1/11.3 MB 4.6 MB/s eta 0:00:01
--------------------------------------- 11.3/11.3 MB 4.7 MB/s eta 0:00:01
---------------------------------------- 11.3/11.3 MB 4.7 MB/s eta 0:00:00
Using cached numpy-2.3.3-cp311-cp311-win_amd64.whl (13.1 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.3.3 pandas-2.3.3 pytz-2025.2 tzdata-2025.2
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [3]:
!pip install openpyxl
Collecting openpyxl Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB) Collecting et-xmlfile (from openpyxl) Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB) Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB) Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB) Installing collected packages: et-xmlfile, openpyxl Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[notice] A new release of pip is available: 24.0 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip
In [4]:
import pandas as pd

# Load the raw workbook and clean it in one pass:
#   1. remove exact duplicate rows,
#   2. turn the listed placeholder strings into pd.NA,
#   3. discard every row that still contains at least one missing value.
# NOTE(review): the placeholder match is exact and case-sensitive, so spellings
# such as 'Not Mentioned' are NOT converted here — confirm whether that is intended.
df = (
    pd.read_excel('clinical_trial_patient_dataset.xlsx')
    .drop_duplicates()
    .replace(
        ['NA', 'N/A', 'None', '', 'not mentioned', 'not available', 'unknown', 'absent'],
        pd.NA,
    )
    .dropna()
)

# Normalise the headers: lower-case, no surrounding whitespace.
df.columns = [header.lower().strip() for header in df.columns]
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0 for bmi, both blood-pressure
# columns, hemoglobin and creatinine — these look like placeholder zeros; confirm
# before relying on the means.
df.describe()
Out[6]:
| age | bmi | systolic_bp | diastolic_bp | hemoglobin_g_dl | creatinine_mg_dl | glucose_mg_dl | visit_completion_rate | missed_visits | medication_adherence | data_quality_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 |
| mean | 54.276444 | 25.139889 | 124.275333 | 76.570444 | 12.401356 | 1.037151 | 100.129778 | 0.549178 | 1.348000 | 0.588802 | 79.869889 |
| std | 14.632067 | 7.958235 | 32.534530 | 21.703229 | 4.156810 | 0.384391 | 25.037816 | 0.385300 | 1.458703 | 0.408459 | 11.953774 |
| min | 18.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 0.000000 | 0.000000 | 0.000000 | 19.300000 |
| 25% | 44.000000 | 21.500000 | 114.000000 | 68.000000 | 11.700000 | 0.850000 | 83.000000 | 0.000000 | 0.000000 | 0.000000 | 72.500000 |
| 50% | 54.000000 | 25.700000 | 129.000000 | 79.000000 | 13.200000 | 1.080000 | 101.000000 | 0.740000 | 1.000000 | 0.810000 | 81.900000 |
| 75% | 64.000000 | 29.900000 | 143.000000 | 90.000000 | 14.700000 | 1.290000 | 117.000000 | 0.860000 | 2.000000 | 0.910000 | 89.100000 |
| max | 85.000000 | 54.700000 | 199.000000 | 137.000000 | 21.900000 | 2.210000 | 198.000000 | 1.000000 | 9.000000 | 1.000000 | 99.900000 |
In [7]:
# Write the cleaned frame to CSV without the index column; later cells reload
# this file with pd.read_csv.
df.to_csv('cleaned_data_from_clinical_set.csv', index=False)
In [13]:
import matplotlib.pyplot as plt

# Recruitment funnel: one bar per trial stage.
# NOTE: the counts below are illustrative placeholders, not values taken from
# the dataset — replace with your data.
stages = ['Screened', 'Enrolled', 'Randomized']
counts = [1000, 700, 600]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(stages, counts, color='skyblue')
ax.set_title('Recruitment Funnel')
ax.set_xlabel('Stages')
ax.set_ylabel('Number of Patients')
plt.show()
In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Convert the enrollment dates in place. The 'Not Mentioned ' placeholder is
# mapped to NaT explicitly; any other value that cannot be parsed is coerced
# to NaT as well.
dates_without_placeholder = df['enrollment_date'].replace('Not Mentioned ', pd.NaT)
df['enrollment_date'] = pd.to_datetime(dates_without_placeholder, errors='coerce')

# Number of rows per enrollment date (groupby leaves out rows whose date is NaT).
rows_per_date = df.groupby('enrollment_date').size()
df_enroll = rows_per_date.reset_index(name='count')

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_enroll, x='enrollment_date', y='count', ax=ax)
ax.set_title('Enrollment Trend Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Enrollments')
plt.show()
In [17]:
# Site leaderboard built from the dataset instead of hard-coded sample numbers.
# Counts the rows of df per site_id.
# NOTE(review): assumes each row of df is one enrolled patient — TODO confirm.
site_counts = df['site_id'].value_counts().sort_values()  # ascending: barh draws the last entry on top
sites = site_counts.index.astype(str).tolist()  # string labels keep the axis categorical
enrollments = site_counts.tolist()

# Grow the figure with the number of sites so the labels stay readable.
fig, ax = plt.subplots(figsize=(10, max(5, 0.3 * len(sites))))
ax.barh(sites, enrollments, color='lightgreen')
ax.set_title('Site Performance Leaderboard')
ax.set_xlabel('Number of Enrollments')
ax.set_ylabel('Sites')
plt.show()
In [28]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Reload the cleaned export written earlier in the notebook.
df = pd.read_csv('cleaned_data_from_clinical_set.csv')

# Mean visit completion rate for every (site, gender) pair, reshaped so that
# sites become rows and genders become columns.
mean_completion = df.groupby(['site_id', 'gender'])['visit_completion_rate'].mean()
df_grouped = mean_completion.unstack()

fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(
    df_grouped,
    annot=True,
    fmt=".2f",
    cmap='YlGnBu',
    cbar_kws={'label': 'Avg Visit Completion Rate'},
    ax=ax,
)
ax.set_title('Average Visit Completion Rate by Site and Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Site ID')
plt.show()
In [39]:
import pandas as pd
import plotly.express as px

# Reload the cleaned export written earlier in the notebook.
df = pd.read_csv('cleaned_data_from_clinical_set.csv')

# Exclude the 'Not Mentioned' placeholder. The comparison runs on a trimmed,
# case-folded copy so spelling variants such as 'Not Mentioned ' or
# 'not mentioned' are excluded too; the original values are left untouched.
reason_key = df['dropout_reason'].astype(str).str.strip().str.casefold()
filtered_df = df[reason_key != 'not mentioned']

# One row per remaining reason with its frequency (missing reasons are not
# counted by value_counts). Columns are named explicitly rather than by position.
dropout_counts = (
    filtered_df['dropout_reason']
    .value_counts()
    .rename_axis('reason')
    .reset_index(name='count')
)

fig = px.pie(dropout_counts, values='count', names='reason', title='Dropout Reasons (Excluding Not Mentioned)')
fig.show(renderer="notebook")
In [41]:
import pandas as pd
import plotly.express as px

# Reload the cleaned export so this cell does not depend on earlier in-memory state.
df = pd.read_csv('cleaned_data_from_clinical_set.csv')

# Histogram of the age column with 30 requested bins.
fig = px.histogram(
    data_frame=df,
    x='age',
    nbins=30,
    title='Age Distribution',
)
fig.show(renderer="notebook")